import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # visualization
import seaborn as sns # visualization
import plotly.graph_objects as go
from scipy import stats
from scipy.stats import norm
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
import xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, ShuffleSplit, LeaveOneOut
from sklearn.metrics import confusion_matrix
from sklearn.externals.joblib import Parallel, parallel_backend
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import dask_ml.datasets
import dask_ml.cluster
from distributed import Executor
from joblib import Parallel, parallel_backend
from scipy.stats import zscore
from sklearn.preprocessing import QuantileTransformer
import warnings
warnings.filterwarnings('ignore') #ignore warnings
%matplotlib inline
import gc
import time
# Imports in order to be able to use Plotly offline.
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import chart_studio.plotly as py
import cufflinks as cf
from plotly.tools import FigureFactory as FF
cf.go_offline()  # render cufflinks charts locally, without the Plotly cloud
cf.set_config_file(offline=False, world_readable=True)
print(__version__) # requires version >= 1.9.0
init_notebook_mode(connected=True)  # embed Plotly JS so charts render in the notebook
# Start a local Dask distributed scheduler; used below via parallel_backend('dask')
# to parallelise the grid searches.
from dask.distributed import Client
client = Client()
client
# Show up to 500 columns when previewing DataFrames (the merged frame is wide).
pd.set_option('display.max_columns', 500)
# Source CSVs from the Walmart store-sales forecasting competition.
# All four tables live in the same download directory.
DATA_DIR = '/Users/yashpasar/Downloads/walmart-recruiting-store-sales-forecasting'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
stores = pd.read_csv(DATA_DIR + '/stores.csv')
features = pd.read_csv(DATA_DIR + '/features.csv')
# Quick sanity peek at the raw tables.
train.head()
stores.head()
features.head()
# Join store metadata and the external features onto the weekly sales rows.
df = train.merge(stores, how='left').merge(features, how='left')
df.head()
df.shape
df['IsHoliday'].value_counts()
# Rebalance the holiday / non-holiday mix: keep every holiday row and
# down-sample the (much larger) non-holiday majority to 15%, with
# replacement, at a fixed seed for reproducibility.
df_true = df[df['IsHoliday'] == True]
df = df[df['IsHoliday'] == False]
df = df.sample(frac=0.15, replace=True, random_state=1)
# DataFrame.append was removed in pandas 2.0 -- pd.concat is the replacement.
df = pd.concat([df, df_true])
df.shape
df['IsHoliday'].value_counts()
# drop=True discards the old index instead of materialising it as an
# 'index' column (the original reset then dropped that column manually).
df = df.reset_index(drop=True)
df.head()
print('Columns with null values:' ,sum(list(df.isnull().any())))
df.isnull().any()
# Replace null values (mostly the sparse MarkDown columns) with 0.
df.fillna(0, inplace=True)
df.head()
# Expand the date into calendar features for the models.
df.Date = pd.to_datetime(df.Date)
df['Year'] = df.Date.dt.year
df['Month'] = df.Date.dt.month
# Series.dt.week was deprecated and removed in pandas 2.0;
# isocalendar().week is the supported replacement (UInt32, so cast to int).
df['Week'] = df.Date.dt.isocalendar().week.astype(int)
df['Day'] = df.Date.dt.day
df.head()
df.describe()
df.info()
# Calculate correlations over the numeric columns only: pandas 2.0 raises
# on mixed-dtype frames (Date, Type are still present here) unless
# numeric_only is set explicitly.
corr = df.corr(numeric_only=True)
# Boolean mask hiding the redundant upper triangle of the heatmap
# (seaborn expects a boolean mask, not floats).
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr,
            vmax=.5,
            mask=mask,
            annot=True, fmt='.2f',
            linewidths=.2, cmap="YlGnBu")
df.shape
print("Number of Stores - ", df['Store'].nunique())
print("Number of Departments - ", df['Dept'].nunique())
# Count of rows per store type
df.Type.value_counts()
# Average size of each Type of store
df.groupby('Type').describe()['Size'].round(2)
# Resample weekly sales to monthly means and split the series into
# trend / seasonal / residual components (additive model).
df1 = df.set_index('Date')
df1.head()
y = df1['Weekly_Sales'].resample('MS').mean()
import statsmodels.api as sm
from pylab import rcParams
rcParams['figure.figsize'] = 18, 10
decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()
# Share of stores by type, derived from the stores table instead of the
# previous hard-coded 22/17/6 magic numbers (value_counts sorts by count
# descending, which matches the A, B, C label order: 22, 17, 6 stores).
type_counts = stores['Type'].value_counts()
labels = ['Store A', 'Store B', 'Store C']
sizes = (type_counts / type_counts.sum() * 100).tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=sizes)])
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
    title_text="Store by sizes")
fig.show()
import plotly.express as px
fig = px.box(df, x='Type', y='Size', color='Type',
             title="Size of each type of store"
             )
fig.show()
# By boxplot, we can infer that type A store is the largest store and C is the smallest
# Even more, there is no overlapped area in size among A, B, and C. Type is the best predictor for Size
#Department wise Sales (non-holiday weeks)
Dept_sales = df[df['IsHoliday'] == False].groupby(df.Dept)['Weekly_Sales'].mean()
#Department wise Holiday Sales
Dept_holiday_sales = df[df['IsHoliday'] == True].groupby(df.Dept)['Weekly_Sales'].mean()
ts = pd.concat([df.Type, df.Weekly_Sales], axis=1)
fig = px.box(ts, x=ts.Type, y=ts.Weekly_Sales, color=ts.Type,
             title="Weekly Sales of each type of store"
             )
fig.show()
# Flag rows whose weekly sales sit more than 2.5 standard deviations from
# the mean. Compute the z-scores once (the original evaluated zscore twice).
z = zscore(df['Weekly_Sales'])
df['outlier'] = np.where(np.abs(z) >= 2.5, 1, 0)
num_outliers = df[df['outlier'] == 1]['Weekly_Sales'].count()
print('Number of `Weekly_Sales` outliers: {}\nPercent outliers: {:.2f}%'.format(num_outliers, num_outliers / df.shape[0] * 100))
qt = QuantileTransformer(output_distribution='normal')
df_copy = df.copy()
# Raw weekly-sales distribution is heavily right-skewed.
# sns.distplot is deprecated (removed in seaborn 0.14); histplot replaces it.
sns.histplot(df_copy['Weekly_Sales'], bins=10).set_title('Weekly Sales')
# Map (1 + sales) onto a normal distribution on a copy first, then re-count
# outliers to see whether the transform helps.
df_copy['Weekly_Sales'] = qt.fit_transform(np.array(df_copy['Weekly_Sales'] + 1).reshape(-1, 1))
z = zscore(df_copy['Weekly_Sales'])
df_copy['outlier'] = np.where(np.abs(z) >= 2.5, 1, 0)
num_outliers = df_copy[df_copy['outlier'] == 1]['Weekly_Sales'].count()
print('QuantileTransformer - Number of `Weekly_Sales` outliers: {}\nPercent outliers: {:.2f}%'.format(num_outliers, num_outliers / df_copy.shape[0] * 100))
# The percent of outliers decreases (3.26% -> 1.18% was observed), so apply
# the same transformation to the working dataset.
df['Weekly_Sales'] = qt.fit_transform(np.array(df['Weekly_Sales'] + 1).reshape(-1, 1))
# Plotting after Transformation (also fixes the 'Qauntile' typo in the title)
sns.histplot(df['Weekly_Sales'], bins=10).set_title('(1+Weekly Sales)\nQuantile Transformer\nno outliers')
ts = pd.concat([df.Type, df.Weekly_Sales], axis=1)
fig = px.box(ts, x=ts.Type, y=ts.Weekly_Sales, color=ts.Type,
             title="Weekly Sales of each type of store"
             )
fig.show()
# The median of A is the highest and C is the lowest.
# That means stores with more sizes have higher sales record because the order of median of size and median of sales
# is the same
# We observe that post transformation our data is less skewed
# Weekly sales by month, split by holiday flag (fixes the 'Salse' typo).
fig = px.box(df, x='Month', y='Weekly_Sales', color='IsHoliday',
             title="Weekly Sales by Month"
             )
fig.show()
fig = px.box(df, x='Dept', y='Weekly_Sales',
             title="Weekly Sales by Month"
             ) # horizontal box plots
fig.show()
# Tabular view of mean sales per department. Rounding to 5 decimals keeps
# the columns numeric rather than round-tripping through '%.5f' strings
# (the original formatted to str and had to cast back to float later).
Dept_sales_pd = Dept_sales.round(5).reset_index()
Dept_sales_pd.columns = ['Dept', 'Weekly_Sales']
Dept_holiday_sales_pd = Dept_holiday_sales.round(5).reset_index()
Dept_holiday_sales_pd.columns = ['Dept', 'Weekly_Holiday_Sales']
Dept_sales.sort_values()
#We found from the above analysis that department number 47 has negative Sales.
#We further drill down to see the negative weekly sales for this department.
df[df['Dept'] == 47]
# Negative values need to be removed to be able to implement log transformation
df.loc[df['Weekly_Sales'] < 0.0, 'Weekly_Sales'] = 0.0
# Recompute the department means now that negative sales are clipped to 0.
#Department wise Sales
Dept_sales = df[df['IsHoliday'] == False].groupby(df.Dept)['Weekly_Sales'].mean()
#Department wise Holiday Sales
Dept_holiday_sales = df[df['IsHoliday'] == True].groupby(df.Dept)['Weekly_Sales'].mean()
# Keep the tables numeric (round instead of '%.5f' string formatting), so
# no string->float cast is needed before the arithmetic below.
Dept_sales_pd = Dept_sales.round(5).reset_index()
Dept_sales_pd.columns = ['Dept', 'Weekly_Sales']
Dept_holiday_sales_pd = Dept_holiday_sales.round(5).reset_index()
Dept_holiday_sales_pd.columns = ['Dept', 'Weekly_Holiday_Sales']
Dept_sales.sort_values()
Dept_holiday_sales.sort_values()
df_holidays = Dept_holiday_sales_pd.sort_values(by='Weekly_Holiday_Sales')
Dept_holiday_sales.sort_values().iplot(kind='bar', title='Holiday Sales by Department')
plt.show()
Dept_sales_pd = Dept_sales_pd.merge(Dept_holiday_sales_pd)
Dept_sales_pd.dtypes
# Relative uplift of holiday sales over regular sales, per department.
# NOTE(review): a department whose clipped non-holiday mean is 0 would
# divide by zero here (inf) -- confirm none exists in the data.
Dept_sales_pd['Percent_Increase'] = ((Dept_sales_pd.Weekly_Holiday_Sales - Dept_sales_pd.Weekly_Sales) / (Dept_sales_pd.Weekly_Sales)) * 100
# These departments show a significant change during holidays
Dept_sales_pd[Dept_sales_pd['Percent_Increase'] > 55]
#Department wise sales comparison for holiday and no-holiday
data = pd.concat([df.Dept, df.Weekly_Sales, df.IsHoliday], axis=1)
f, ax = plt.subplots(figsize=(30, 10))
fig = sns.barplot(x='Dept', y="Weekly_Sales", data=data, hue='IsHoliday')
# Encode the boolean holiday flag as 0/1; astype(int) is the idiomatic
# equivalent of the row-wise `1 if x==True else 0` lambda.
df['IsHoliday'] = df['IsHoliday'].astype(int)
#Applying one hot encoding on Type
df = pd.get_dummies(df, columns=['Type'])
# Drop unused columns plus one dummy (Type_A) and the helper 'outlier' flag.
df.drop(columns=['Date', "CPI", "Fuel_Price", 'Unemployment', 'MarkDown4', 'Type_A', 'outlier'], inplace=True)
# Replace negative MarkDown values by 0 (MarkDown4 was already dropped
# above, so it is not in the list).
for col in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown5']:
    df[col] = df[col].clip(lower=0)
df.head()
# Apply the same preprocessing pipeline to the competition test set.
test = test.merge(stores, how='left').merge(features, how='left')
print('Columns with null values:' ,sum(list(test.isnull().any())))
test.isnull().any()
test.fillna(0, inplace=True)
test.Date = pd.to_datetime(test.Date)
test['Year'] = test.Date.dt.year
test['Month'] = test.Date.dt.month
# Series.dt.week was removed in pandas 2.0; isocalendar().week replaces it.
test['Week'] = test.Date.dt.isocalendar().week.astype(int)
test['Day'] = test.Date.dt.day
test = pd.get_dummies(test, columns=['Type'])
# Encode the boolean holiday flag as 0/1.
test['IsHoliday'] = test['IsHoliday'].astype(int)
# Replace negative MarkDown values by 0.
for col in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
    test[col] = test[col].clip(lower=0)
# Keep the dates aside for the final output before dropping the column.
date = test.Date
test.drop(["Date", "CPI", "Fuel_Price", 'Unemployment', 'MarkDown4', 'Type_A'], axis=1, inplace=True)
test.head()
# Temporal split: train on 2010-2011, hold out 2012 for validation.
df_2y = df[df['Year'] != 2012]
df_2012 = df[df['Year'] == 2012]
X_train = df_2y.drop('Weekly_Sales', axis=1)
y_train = df_2y['Weekly_Sales']
X_valid = df_2012.drop('Weekly_Sales', axis=1)
y_valid = df_2012['Weekly_Sales']
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn import preprocessing
# Standardise the features; the scaler is fitted on the training split
# only, then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_valid_std = scaler.transform(X_valid)
X_train = pd.DataFrame(X_train_std, index=X_train.index, columns=X_train.columns)
X_valid = pd.DataFrame(X_valid_std, index=X_valid.index, columns=X_valid.columns)
# First baseline: K-nearest-neighbours regressor, timing the fit.
import time
start = time.time()
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
end = time.time()
time_elapsed = end - start
time_elapsed
def weighted_mae_custom(y_true, y_pred, weights=None):
    '''
    Weighted mean absolute error per the competition's evaluation section:
    holiday weeks carry weight 5, all other weeks weight 1.

    Parameters
    ----------
    y_true : pd.Series of actual values, indexed like the validation frame.
    y_pred : array-like of predictions, same length as y_true.
    weights : optional pd.Series holding the IsHoliday column; defaults to
        the validation split's column so make_scorer can call with two args.

    Fixes over the original:
    - The raw column was used directly as the weights. Besides ignoring the
      5-vs-1 scheme, the column has been standardised, so its values are
      z-scores, not 0/1. Holiday rows are identified as the rows carrying
      the larger of the two values (works for both raw 0/1 and scaled data).
    - Computing on ndarrays avoids pandas index alignment between the
      freshly re-indexed weights and y_true, which produced NaNs.
    '''
    if weights is None:
        weights = X_valid['IsHoliday']
    flags = weights.loc[y_true.index.values].values.reshape(-1)
    # Holiday rows carry the larger of the two encoded values -> weight 5.
    sample_weights = np.where(flags > flags.min(), 5.0, 1.0)
    diffs = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return float(np.sum(sample_weights * diffs) / np.sum(sample_weights))
# Wrap the custom metric as a scikit-learn scorer and score the KNN baseline
# on the held-out 2012 validation split.
weighted_mae = make_scorer(weighted_mae_custom)
weighted_mae(knn, X_valid, y_valid)
def weighted_mae_custom(y_true, y_pred, weights=None):
    '''
    WMAE used inside grid-search cross-validation: weights come from the
    training split (holiday weeks weigh 5, other weeks 1).

    Parameters
    ----------
    y_true : pd.Series of actual values, indexed like the training frame.
    y_pred : array-like of predictions, same length as y_true.
    weights : optional pd.Series holding the IsHoliday column; defaults to
        the training split's column so make_scorer can call with two args.

    Same fixes as the validation-split variant above: identify holiday rows
    as the larger of the two (standardised) values instead of using the raw
    z-scored column as weights, and compute on ndarrays to avoid pandas
    index misalignment between weights and y_true.
    '''
    if weights is None:
        weights = X_train['IsHoliday']
    flags = weights.loc[y_true.index.values].values.reshape(-1)
    sample_weights = np.where(flags > flags.min(), 5.0, 1.0)
    diffs = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return float(np.sum(sample_weights * diffs) / np.sum(sample_weights))
weighted_mae1 = make_scorer(weighted_mae_custom)
# Grid-search KNN hyper-parameters on the Dask backend.
# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24,
# so it is no longer passed.
params = {'n_neighbors': [2, 3, 4], 'p': [1, 2]}
with parallel_backend('dask'):
    gs_cv = GridSearchCV(knn,
                         param_grid=params,
                         cv=3,
                         scoring=weighted_mae1,
                         n_jobs=-1,
                         verbose=2).fit(X_train, y_train)
gs_cv.best_params_
weighted_mae(gs_cv, X_valid, y_valid)
# Support-vector regression, trained inside a pipeline with the scaler.
start = time.time()
clf = svm.SVR()
svm_pipeline = make_pipeline(scaler, clf)
with parallel_backend('dask'):
    svm_pipeline.fit(X_train, y_train)
end = time.time()
time_elapsed = end - start
time_elapsed
# Pipeline.fit fits the step objects in place, so clf itself is fitted.
weighted_mae(clf, X_valid, y_valid)
# Tune the SVR step only (index 1 of the pipeline). The `iid` argument
# was removed from GridSearchCV in scikit-learn 0.24.
param_grid = {'gamma': [1, 0.1, 0.001],
              'C': [0.25, 0.5, 1]}
grid = GridSearchCV(svm_pipeline[1],
                    param_grid,
                    cv=3,
                    scoring=weighted_mae1,
                    n_jobs=-1,
                    verbose=2)
with parallel_backend('dask'):
    grid.fit(X_train, y_train)
grid.best_params_
weighted_mae(grid, X_valid, y_valid)
# Random-forest baseline.
start = time.time()
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
end = time.time()
time_elapsed = end - start
time_elapsed
weighted_mae(rf, X_valid, y_valid)
feature_imp = pd.DataFrame(rf.feature_importances_, index=X_train.columns,
                           columns=['importance']).sort_values('importance', ascending=False)
feature_imp
# 'auto' was removed as a max_features option for regressors in
# scikit-learn 1.3; for a regressor it meant "all features", i.e. 1.0.
# The `iid` argument was removed from GridSearchCV in 0.24.
params = {'n_estimators': [100, 250, 500],
          'max_features': ['log2', 1.0, 'sqrt']}
gs_rf = GridSearchCV(rf,
                     param_grid=params,
                     cv=3,
                     scoring=weighted_mae1,
                     n_jobs=-1,
                     verbose=2).fit(X_train, y_train)
df_results = pd.DataFrame(gs_rf.cv_results_)
df_results.pivot(index='param_n_estimators',
                 columns='param_max_features',
                 values='mean_test_score').round(3).style.background_gradient('coolwarm', axis=None)
gs_rf.best_params_
weighted_mae(gs_rf, X_valid, y_valid)
# Refit a forest with the winning hyper-parameters to inspect importances.
n_estimators = gs_rf.best_params_['n_estimators']
max_features = gs_rf.best_params_['max_features']
rf1 = RandomForestRegressor(n_estimators=n_estimators, max_features=max_features)
rf1.fit(X_train, y_train)
feature_imp = pd.DataFrame(rf1.feature_importances_, index=X_train.columns,
                           columns=['importance']).sort_values('importance', ascending=False)
feature_imp
# Undo the quantile transform to get predictions back in dollars. The
# transform was fitted on (sales + 1), so inverse_transform yields
# (sales + 1) and we must SUBTRACT 1 -- the original added 1, shifting
# every prediction up by 2.
test['Weekly_Sales'] = qt.inverse_transform(gs_rf.predict(test).reshape(-1, 1)) - 1
test['Date'] = date
test.head()
# sklearn.externals.joblib was removed in scikit-learn 0.23; import the
# standalone joblib package directly.
import joblib
joblib.dump(test, 'rf_test.pkl')
# Linear-regression baseline.
import time
start = time.time()
lr = LinearRegression()
lr.fit(X_train, y_train)
end = time.time()
time_elapsed = end - start
time_elapsed
weighted_mae(lr, X_valid, y_valid)
myarray = np.asarray(lr.coef_)
coefficient_imp = pd.DataFrame(list(zip(X_train.columns, myarray)),
                               columns=['column', 'Coefficients']).sort_values('Coefficients', ascending=False)
coefficient_imp['Coefficients'] = coefficient_imp['Coefficients'].apply(lambda x: '{:.2f}'.format(x))
coefficient_imp
# `normalize` was removed from LinearRegression in scikit-learn 1.2 (the
# features are already standardised above anyway), and `iid` was removed
# from GridSearchCV in 0.24 -- neither is searched/passed any more.
params = {'fit_intercept': [True, False], 'copy_X': [True, False]}
gs_lr = GridSearchCV(lr,
                     param_grid=params,
                     cv=3,
                     scoring=weighted_mae1,
                     n_jobs=-1,
                     verbose=2)
gs_lr.fit(X_train, y_train)
gs_lr.best_params_
# Refit with the winning hyper-parameters and inspect the coefficients.
fit_intercept = gs_lr.best_params_['fit_intercept']
copy_X = gs_lr.best_params_['copy_X']
lr1 = LinearRegression(fit_intercept=fit_intercept, copy_X=copy_X)
lr1.fit(X_train, y_train)
weighted_mae(lr1, X_valid, y_valid)
myarray = np.asarray(lr1.coef_)
coefficient_imp = pd.DataFrame(list(zip(X_train.columns, myarray)),
                               columns=['column', 'Coefficients']).sort_values('Coefficients', ascending=False)
coefficient_imp['Coefficients'] = coefficient_imp['Coefficients'].apply(lambda x: '{:.2f}'.format(x))
coefficient_imp
# Compare LR coefficients and RF importances per feature. Each trace uses
# its OWN table's feature labels as x: the original paired the unsorted
# X_train.columns with value columns that had been sorted descending, so
# every bar was labelled with the wrong feature.
fig = go.Figure(data=[
    go.Bar(name='LR', x=coefficient_imp['column'], y=coefficient_imp.Coefficients),
    go.Bar(name='RF', x=feature_imp.index, y=feature_imp.importance)
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.update_layout(
    title_text="Feature Comparison between Linear Regression and Random Forest")
fig.show()
# Summary of measured fit times and validation WMAE per model.
# (list renamed from `time` to `fit_times` so it no longer shadows the
# imported stdlib `time` module)
name = ['K Nearest Neighbor', 'Support Vector Machine', 'Random Forest', 'Linear Regression']
fit_times = [8.8, 180.31, 2.62, 0.039]
gs_time = [300, 1507, 306, 0.8]
wmae = [0.1093, 0.1096, 0.064, 0.122]
result_df = pd.DataFrame(list(zip(name, fit_times, gs_time, wmae)),
                         columns=['Model', 'Initial model Time (secs)', 'Grid search time (secs)', 'WMAE'])
result_df.head()
fig = px.bar(result_df, x='Model', y='Grid search time (secs)', color='WMAE', height=400)
fig.update_layout(
    title_text="Overall Model Comparison")
fig.show()